Exercise: Web Crawler

#Golang

code:solution.go

package main

import (

"fmt"

"sync"

)

// Fetcher is the source of pages that Crawl reads from.
type Fetcher interface {
	// Fetch retrieves the page at url. It returns the page body and the
	// slice of URLs found on that page, or a non-nil err on failure.
	Fetch(url string) (body string, urls []string, err error)
}

// Cache records which URLs have already been seen. Every access to the
// underlying map is made while holding mux, so a Cache may be shared
// between goroutines. A Cache must not be copied after first use because
// it contains a sync.Mutex.
type Cache struct {
	visited map[string]bool // set of URLs marked by Visit
	mux     sync.Mutex      // guards visited
}

// NewCache returns an empty Cache that is ready to use.
func NewCache() *Cache {
	return &Cache{visited: make(map[string]bool)}
}

// Visit marks url as visited.
func (c *Cache) Visit(url string) {
	c.mux.Lock()
	defer c.mux.Unlock()
	if c.visited == nil {
		// Writing to a nil map panics; allocate lazily so that a
		// zero-value Cache works without going through NewCache.
		c.visited = make(map[string]bool)
	}
	c.visited[url] = true
}

// IsVisited reports whether url has previously been passed to Visit.
func (c *Cache) IsVisited(url string) bool {
	c.mux.Lock()
	defer c.mux.Unlock()
	return c.visited[url]
}

// c is the package-level Cache of visited URLs that Crawl reads and updates.
// It replaces an earlier plain map, var visited = make(map[string]bool).
var c = NewCache()

// Crawl uses fetcher to recursively crawl

// pages starting with url, to a maximum of depth.

func Crawl(url string, depth int, fetcher Fetcher) {

// TODO: Fetch URLs in parallel.

// TODO: Don't fetch the same URL twice.

// This implementation doesn't do either:

if depth <= 0 {

return

}

body, urls, err := fetcher.Fetch(url)

// visitedurl = true

c.Visit(url)

if err != nil {

fmt.Println(err)

return

}

fmt.Printf("found: %s %q\n", url, body)

for _, u := range urls {

if !c.IsVisited(u) {

Crawl(u, depth-1, fetcher)

}

return

}

// main crawls the canned pages held in fetcher, starting from the
// golang.org root with a depth limit of 4.
func main() {
	const (
		startURL = "https://golang.org/"
		maxDepth = 4
	)
	Crawl(startURL, maxDepth, fetcher)
}

// fakeFetcher is a Fetcher that returns canned results, keyed by URL.
type fakeFetcher map[string]*fakeResult

// fakeResult is the canned response stored for a single URL.
type fakeResult struct {
	body string   // page body returned by Fetch
	urls []string // URLs returned by Fetch as found on the page
}

// Fetch returns the canned body and URLs stored under url. If url has no
// entry in f, it returns an empty body, nil URLs and a "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.

var fetcher = fakeFetcher{

"https://golang.org/": &fakeResult{

"The Go Programming Language",

[]string{

"https://golang.org/pkg/",

"https://golang.org/cmd/",

"https://golang.org/pkg/": &fakeResult{

"Packages",

[]string{

"https://golang.org/",

"https://golang.org/cmd/",

"https://golang.org/pkg/fmt/",

"https://golang.org/pkg/os/",

"https://golang.org/pkg/fmt/": &fakeResult{

"Package fmt",

[]string{

"https://golang.org/",

"https://golang.org/pkg/",

"https://golang.org/pkg/os/": &fakeResult{

"Package os",

[]string{

"https://golang.org/",

"https://golang.org/pkg/",

}